#Loading libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggthemes)

Exploratory Data Analysis (EDA)

🗂️⏳ Getting started: Loading datasets into working directory

dataset_crime <- read_csv("crime_2020_2023.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 89335 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (11): Date Rptd, DATE OCC, AREA NAME, Crm Cd Desc, Category, Vict Sex, ...
## dbl   (7): DR_NO, Vict Age, Crm Cd 1, Crm Cd 2, Crm Cd 3, LAT, LON
## lgl   (1): Crm Cd 4
## time  (1): TIME OCC
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dataset_weather <- read_csv("weather_2020_2023.csv")
## Rows: 1461 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (4): temp, feelslike, humidity, windspeed
## date (1): datetime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Discovering the datasets

🔎📝 Initial Exploration Tasks

How many variables are in each dataset?

Crime dataset

str(dataset_crime)
## spc_tbl_ [89,335 × 20] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ DR_NO       : num [1:89335] 2.00e+08 2.02e+08 2.20e+08 2.22e+08 2.02e+08 ...
##  $ Date Rptd   : chr [1:89335] "09/04/2020 12:00:00 AM" "03/06/2020 12:00:00 AM" "01/14/2022 12:00:00 AM" "08/03/2022 12:00:00 AM" ...
##  $ DATE OCC    : chr [1:89335] "08/30/2020 12:00:00 AM" "03/06/2020 12:00:00 AM" "01/11/2022 12:00:00 AM" "08/03/2022 12:00:00 AM" ...
##  $ TIME OCC    : 'hms' num [1:89335] 11:00:00 22:30:00 08:00:00 00:01:00 ...
##   ..- attr(*, "units")= chr "secs"
##  $ AREA NAME   : chr [1:89335] "Southwest" "Devonshire" "Southwest" "Mission" ...
##  $ Crm Cd Desc : chr [1:89335] "VEHICLE - STOLEN" "BURGLARY" "THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER)" "THEFT OF IDENTITY" ...
##  $ Category    : chr [1:89335] "THEFT" "BURGLARY" "THEFT" "THEFT" ...
##  $ Vict Age    : num [1:89335] 0 47 0 43 46 26 35 36 26 30 ...
##  $ Vict Sex    : chr [1:89335] NA "F" NA "F" ...
##  $ Vict Descent: chr [1:89335] NA "A" NA "H" ...
##  $ Premis Desc : chr [1:89335] "STREET" "SINGLE FAMILY DWELLING" "STREET" "SINGLE FAMILY DWELLING" ...
##  $ Weapon Desc : chr [1:89335] NA NA NA NA ...
##  $ Status Desc : chr [1:89335] "Invest Cont" "Invest Cont" "Invest Cont" "Invest Cont" ...
##  $ Crm Cd 1    : num [1:89335] 510 310 420 354 354 230 230 624 210 230 ...
##  $ Crm Cd 2    : num [1:89335] NA 998 NA NA NA NA NA NA NA NA ...
##  $ Crm Cd 3    : num [1:89335] NA NA NA NA NA NA NA NA NA NA ...
##  $ Crm Cd 4    : logi [1:89335] NA NA NA NA NA NA ...
##  $ LOCATION    : chr [1:89335] "2000 W  43RD                         ST" "12200    LOUISE                       AV" "2100 S  HOOVER                       ST" "10000 N  SEPULVEDA                    BL" ...
##  $ LAT         : num [1:89335] 34 34.3 34 34.3 34 ...
##  $ LON         : num [1:89335] -118 -119 -118 -118 -118 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   DR_NO = col_double(),
##   ..   `Date Rptd` = col_character(),
##   ..   `DATE OCC` = col_character(),
##   ..   `TIME OCC` = col_time(format = ""),
##   ..   `AREA NAME` = col_character(),
##   ..   `Crm Cd Desc` = col_character(),
##   ..   Category = col_character(),
##   ..   `Vict Age` = col_double(),
##   ..   `Vict Sex` = col_character(),
##   ..   `Vict Descent` = col_character(),
##   ..   `Premis Desc` = col_character(),
##   ..   `Weapon Desc` = col_character(),
##   ..   `Status Desc` = col_character(),
##   ..   `Crm Cd 1` = col_double(),
##   ..   `Crm Cd 2` = col_double(),
##   ..   `Crm Cd 3` = col_double(),
##   ..   `Crm Cd 4` = col_logical(),
##   ..   LOCATION = col_character(),
##   ..   LAT = col_double(),
##   ..   LON = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary(dataset_crime)
##      DR_NO            Date Rptd           DATE OCC           TIME OCC       
##  Min.   :200100508   Length:89335       Length:89335       Length:89335     
##  1st Qu.:210306848   Class :character   Class :character   Class1:hms       
##  Median :220313378   Mode  :character   Mode  :character   Class2:difftime  
##  Mean   :217004722                                         Mode  :numeric   
##  3rd Qu.:230206364                                                          
##  Max.   :242104022                                                          
##                                                                             
##   AREA NAME         Crm Cd Desc          Category            Vict Age    
##  Length:89335       Length:89335       Length:89335       Min.   :-2.00  
##  Class :character   Class :character   Class :character   1st Qu.: 6.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :31.00  
##                                                           Mean   :29.63  
##                                                           3rd Qu.:45.00  
##                                                           Max.   :99.00  
##                                                                          
##    Vict Sex         Vict Descent       Premis Desc        Weapon Desc       
##  Length:89335       Length:89335       Length:89335       Length:89335      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Status Desc           Crm Cd 1        Crm Cd 2        Crm Cd 3    
##  Length:89335       Min.   :110.0   Min.   :210.0   Min.   :626.0  
##  Class :character   1st Qu.:331.0   1st Qu.:998.0   1st Qu.:998.0  
##  Mode  :character   Median :442.0   Median :998.0   Median :998.0  
##                     Mean   :500.6   Mean   :957.7   Mean   :983.9  
##                     3rd Qu.:626.0   3rd Qu.:998.0   3rd Qu.:998.0  
##                     Max.   :956.0   Max.   :999.0   Max.   :999.0  
##                                     NA's   :82702   NA's   :89109  
##  Crm Cd 4         LOCATION              LAT             LON        
##  Mode:logical   Length:89335       Min.   : 0.00   Min.   :-118.7  
##  NA's:89335     Class :character   1st Qu.:34.01   1st Qu.:-118.4  
##                 Mode  :character   Median :34.06   Median :-118.3  
##                                    Mean   :33.98   Mean   :-118.0  
##                                    3rd Qu.:34.16   3rd Qu.:-118.3  
##                                    Max.   :34.33   Max.   :   0.0  
## 
#Checking number of variables
ncol(dataset_crime) 
## [1] 20

🧐 ➡️ There are 20 variables in the Crime dataset

Weather dataset

str(dataset_weather)
## spc_tbl_ [1,461 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ datetime : Date[1:1461], format: "2020-01-01" "2020-01-02" ...
##  $ temp     : num [1:1461] 15.2 14.8 16.2 14.7 14.5 17.1 17.6 14.3 14 13.3 ...
##  $ feelslike: num [1:1461] 15.2 14.8 16.2 14.7 14.5 17.1 17.6 14.3 14 13.3 ...
##  $ humidity : num [1:1461] 48.9 69.6 53 63.6 59.1 34.9 32.3 68.1 58.3 61.2 ...
##  $ windspeed: num [1:1461] 12 11.1 8.1 6.6 8.6 7.4 7.8 9.8 12.7 10 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   datetime = col_date(format = ""),
##   ..   temp = col_double(),
##   ..   feelslike = col_double(),
##   ..   humidity = col_double(),
##   ..   windspeed = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary(dataset_weather)
##     datetime               temp        feelslike        humidity    
##  Min.   :2020-01-01   Min.   : 8.3   Min.   : 7.50   Min.   :15.00  
##  1st Qu.:2020-12-31   1st Qu.:15.1   1st Qu.:15.10   1st Qu.:55.20  
##  Median :2021-12-31   Median :18.1   Median :18.00   Median :66.50  
##  Mean   :2021-12-31   Mean   :18.2   Mean   :18.18   Mean   :62.21  
##  3rd Qu.:2022-12-31   3rd Qu.:21.1   3rd Qu.:21.10   3rd Qu.:71.80  
##  Max.   :2023-12-31   Max.   :32.4   Max.   :32.00   Max.   :93.90  
##    windspeed    
##  Min.   : 1.30  
##  1st Qu.:10.10  
##  Median :12.50  
##  Mean   :13.58  
##  3rd Qu.:16.60  
##  Max.   :42.80
#Checking number of variables
ncol(dataset_weather) 
## [1] 5

🧐 ➡ There are 5 variables in Weather dataset

How many observations are there for each variable?

Crime dataset

nrow(dataset_crime) 
## [1] 89335

🧐 ➡ There are 89335 observations in Crime dataset

Weather dataset

nrow(dataset_weather) 
## [1] 1461

🧐 ➡ There are 1461 observations in Weather dataset

Interpret variables from the Crime dataset (8 are described below)

  1. “DR_NO”: ID number of the report
  2. “Date Rptd”: Date reported
  3. “DATE OCC”: Date of occurrence
  4. “TIME OCC”: Time of occurrence
  5. “AREA NAME”: Name of the area where the crime occurred
  6. “Crm Cd”: Crime code
  7. “LAT”: Latitude
  8. “LON”: Longitude

What types of variables are present in the dataset?

sapply(dataset_crime, class) 
## $DR_NO
## [1] "numeric"
## 
## $`Date Rptd`
## [1] "character"
## 
## $`DATE OCC`
## [1] "character"
## 
## $`TIME OCC`
## [1] "hms"      "difftime"
## 
## $`AREA NAME`
## [1] "character"
## 
## $`Crm Cd Desc`
## [1] "character"
## 
## $Category
## [1] "character"
## 
## $`Vict Age`
## [1] "numeric"
## 
## $`Vict Sex`
## [1] "character"
## 
## $`Vict Descent`
## [1] "character"
## 
## $`Premis Desc`
## [1] "character"
## 
## $`Weapon Desc`
## [1] "character"
## 
## $`Status Desc`
## [1] "character"
## 
## $`Crm Cd 1`
## [1] "numeric"
## 
## $`Crm Cd 2`
## [1] "numeric"
## 
## $`Crm Cd 3`
## [1] "numeric"
## 
## $`Crm Cd 4`
## [1] "logical"
## 
## $LOCATION
## [1] "character"
## 
## $LAT
## [1] "numeric"
## 
## $LON
## [1] "numeric"
sapply(dataset_weather, class) 
##  datetime      temp feelslike  humidity windspeed 
##    "Date" "numeric" "numeric" "numeric" "numeric"

Are there any issues with the variable types that could complicate future analysis?

Crime dataset

  • “Date Rptd” & “DATE OCC” are stored as character instead of a proper date type.
  • “Category” is stored as character, but would be better represented as categorical (factor) data.
  • “Vict Age” is stored as numeric, but contains negative values, likely from human error during data entry.

Weather dataset

No potential issues found

How many missing values are there per variable?

Crime dataset

filtered_crime_missing_value <- sapply(dataset_crime, function(x) sum(is.na(x)))
filtered_crime_missing_value
##        DR_NO    Date Rptd     DATE OCC     TIME OCC    AREA NAME  Crm Cd Desc 
##            0            0            0            0            0            0 
##     Category     Vict Age     Vict Sex Vict Descent  Premis Desc  Weapon Desc 
##            0            0        11655        11655           54        58054 
##  Status Desc     Crm Cd 1     Crm Cd 2     Crm Cd 3     Crm Cd 4     LOCATION 
##            0            0        82702        89109        89335            0 
##          LAT          LON 
##            0            0
# Filter only variables with missing value 
filtered_crime_missing_value[filtered_crime_missing_value > 0] 
##     Vict Sex Vict Descent  Premis Desc  Weapon Desc     Crm Cd 2     Crm Cd 3 
##        11655        11655           54        58054        82702        89109 
##     Crm Cd 4 
##        89335

Weather dataset

filtered_weather_missing_value <- sapply(dataset_weather, function(x) sum(is.na(x)))
filtered_weather_missing_value
##  datetime      temp feelslike  humidity windspeed 
##         0         0         0         0         0
# Filter only variables with missing values
# (bug fix: the original indexed the *crime* vector with the weather mask,
# i.e. filtered_crime_missing_value[filtered_weather_missing_value > 0])
filtered_weather_missing_value[filtered_weather_missing_value > 0] 
## named integer(0)

🧐 ➡ There is no missing value in Weather dataset❗

Which variables have the highest percentage of missing values?

Crime dataset

# Share of missing values in each column, expressed as a percentage.
# is.na() on a data frame yields a logical matrix, so colMeans() gives the
# per-column missing fraction directly.
missing_value_crime_percentage <- colMeans(is.na(dataset_crime)) * 100

# Report only the affected columns, most-missing first
sort(missing_value_crime_percentage[missing_value_crime_percentage > 0], decreasing = TRUE) 
##     Crm Cd 4     Crm Cd 3     Crm Cd 2  Weapon Desc     Vict Sex Vict Descent 
## 100.00000000  99.74701965  92.57513852  64.98460850  13.04639839  13.04639839 
##  Premis Desc 
##   0.06044663

🔢️📊 Initial Data Visualization

Plot a histogram of age

dataset_crime$`Vict Age` <- as.numeric(dataset_crime$`Vict Age`)

ggplot(dataset_crime, aes(`Vict Age`)) +
  geom_histogram(fill = "pink", color = "black", binwidth = 5) +  
  labs(title="Distribution of Victim Age",
       x="Age",
       y="Number of cases") +
  theme_minimal() 

Plot a barplot of descent and note down the 5 most affected victim descents

ggplot(dataset_crime, aes(x = `Vict Descent`)) +
  geom_bar(fill = "lightblue", color = "black") +
  labs(title = "Distribution of Victim Descent",
       x = "Victim Descent",
       y = "Number of Cases") +
  theme_minimal()

Plot the victim’s gender distribution

# Calculating percentage of each gender
vict_gender_counts <- dataset_crime %>%
  count(`Vict Sex`) %>%
  mutate(`Vict Gender Percentage` = n / sum(n) * 100)

# Plot the gender distribution using pie chart
ggplot(vict_gender_counts, aes(x = "", y = `Vict Gender Percentage`, fill = `Vict Sex`)) +
  geom_bar(stat = "identity", width = 1, color = "black") +
  coord_polar(theta = "y") +
  geom_text(aes(label = paste0(round(`Vict Gender Percentage`, 1), "%")), 
            position = position_stack(vjust = 0.5), color = "white") +
  labs(title = "Distribution of Victim Gender in Percentage",
       x = "",
       y = "",
       fill = "Victim Gender") +
  scale_fill_manual(values = c("lightblue","orange", "lightpink", "lightgreen")) +
  theme_classic() +
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        axis.text.y = element_blank(),  
        axis.title.y = element_blank()) 

How many cases remain open? Create a bar chart to visualize this

# Checking how many kind of status
table(dataset_crime$`Status Desc`)
## 
## Adult Arrest  Adult Other  Invest Cont   Juv Arrest    Juv Other          UNK 
##         8195        10321        70347          311          160            1
# Counting number of cases of each status
case_status_counts <- dataset_crime %>%
  count(`Status Desc`)

# Filter cases remain open
open_cases <- case_status_counts %>%
  filter(`Status Desc` == "Invest Cont") %>%
  pull(n)

# Plot the case status using bar chart
ggplot(case_status_counts, aes(x = `Status Desc`, y = n, fill = `Status Desc`)) +
  geom_bar(stat = "identity", color = "black", width = 0.5) +
  labs(title = "Distribution of Current Investigation Status",
       x = "Case Status",
       y = "Number of Cases") +
  theme_minimal() +
  theme(legend.position = "none") 

What do you think about this plot? Write down your thoughts about what’s good and especially bad about this plot

✅ : - Provide all needed information of the data - Readable and understandable

❌ : - Could have been better if there is annotation to clarify exactly how many cases of each age group (see below for a revised version)

✨ Revised version:

# Bin victim ages into 10-year groups: [0,10), [10,20), ..., [90,100).
# Bug fix: the original labels read "0-5", "10-15", ... — i.e. 5-year
# labels on 10-year-wide bins — so every bar was mislabelled. Labels now
# match the actual bin width ("0-9", "10-19", ...).
dataset_crime$AgeGroup <- cut(dataset_crime$`Vict Age`, 
                              breaks = seq(0, 100, by = 10), 
                              right = FALSE, 
                              labels = paste(seq(0, 90, by = 10), seq(9, 99, by = 10), sep = "-"))

ggplot(dataset_crime, aes(AgeGroup)) +
  geom_bar(fill = "blue", color = "black") +  
  labs(title="Distribution of Victim Age",
       x="Age Group",
       y="Number of cases") +
  theme_minimal() +
  # Annotate each bar with its exact count so values are readable at a glance
  geom_text(stat='count', aes(label=after_stat(count)), vjust=-0.5, size=3)

Data Cleaning and Transformation

👮🧹 Cleaning the Crime Dataset

Review the dataset and drop any columns that are mostly empty, irrelevant or unlikely to be used in the analysis

As we analyzed above, the variables having the most missing values are:

  • Crm Cd 4
  • Crm Cd 3
  • Crm Cd 2
  • Weapon Desc

Furthermore, below are columns that will also be removed as they are irrelevant or less likely to be used in the analysis:

  • Premis Desc (Might be redundant)
  • Location (Full address might not be necessary as we also have geocoordinates: LAT & LON)

🧐 ➡ Therefore, we will remove those columns in the next step of cleaning the dataset.

Make the column names lowercase and remove any spaces and replace them with underscores “_” to follow naming conventions. You can also choose entirely new names that are more meaningful

# Convert column names to lowercase and replace spaces with "_"
colnames(dataset_crime) <- tolower(gsub(" ", "_", colnames(dataset_crime)))
print(colnames(dataset_crime))
##  [1] "dr_no"        "date_rptd"    "date_occ"     "time_occ"     "area_name"   
##  [6] "crm_cd_desc"  "category"     "vict_age"     "vict_sex"     "vict_descent"
## [11] "premis_desc"  "weapon_desc"  "status_desc"  "crm_cd_1"     "crm_cd_2"    
## [16] "crm_cd_3"     "crm_cd_4"     "location"     "lat"          "lon"         
## [21] "agegroup"
# Filter columns that have many missing value, are irrelevant or less likely to be used in the analysis 
dataset_crime_cleaned <- dataset_crime %>%
  select(-crm_cd_2, -crm_cd_3, -crm_cd_4, -weapon_desc, -location, -premis_desc) %>%
  # Rename the remaining columns
  rename(
    report_number = dr_no,
    date_reported = date_rptd,
    date_occurrence = date_occ,
    time_occurrence = time_occ,
    crime_code_description = crm_cd_desc,
    victim_age = vict_age,
    victim_sex = vict_sex,
    victim_descent = vict_descent,
    status_description = status_desc,
    crime_code_1 = crm_cd_1,
    latitude = lat,
    longitude = lon
  )

# Check the updated dataset
str(dataset_crime_cleaned)
## tibble [89,335 × 15] (S3: tbl_df/tbl/data.frame)
##  $ report_number         : num [1:89335] 2.00e+08 2.02e+08 2.20e+08 2.22e+08 2.02e+08 ...
##  $ date_reported         : chr [1:89335] "09/04/2020 12:00:00 AM" "03/06/2020 12:00:00 AM" "01/14/2022 12:00:00 AM" "08/03/2022 12:00:00 AM" ...
##  $ date_occurrence       : chr [1:89335] "08/30/2020 12:00:00 AM" "03/06/2020 12:00:00 AM" "01/11/2022 12:00:00 AM" "08/03/2022 12:00:00 AM" ...
##  $ time_occurrence       : 'hms' num [1:89335] 11:00:00 22:30:00 08:00:00 00:01:00 ...
##   ..- attr(*, "units")= chr "secs"
##  $ area_name             : chr [1:89335] "Southwest" "Devonshire" "Southwest" "Mission" ...
##  $ crime_code_description: chr [1:89335] "VEHICLE - STOLEN" "BURGLARY" "THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER)" "THEFT OF IDENTITY" ...
##  $ category              : chr [1:89335] "THEFT" "BURGLARY" "THEFT" "THEFT" ...
##  $ victim_age            : num [1:89335] 0 47 0 43 46 26 35 36 26 30 ...
##  $ victim_sex            : chr [1:89335] NA "F" NA "F" ...
##  $ victim_descent        : chr [1:89335] NA "A" NA "H" ...
##  $ status_description    : chr [1:89335] "Invest Cont" "Invest Cont" "Invest Cont" "Invest Cont" ...
##  $ crime_code_1          : num [1:89335] 510 310 420 354 354 230 230 624 210 230 ...
##  $ latitude              : num [1:89335] 34 34.3 34 34.3 34 ...
##  $ longitude             : num [1:89335] -118 -119 -118 -118 -118 ...
##  $ agegroup              : Factor w/ 10 levels "0-5","10-15",..: 1 5 1 5 5 3 4 4 3 4 ...

Replace any missing values in the dataset with NA

# Replace any missing values with NA
# NOTE(review): this statement is a no-op — readr::read_csv() already stores
# missing cells as NA, and is.na() selects exactly the cells that are already
# NA. Kept to document the assignment step; safe to remove.
dataset_crime_cleaned[is.na(dataset_crime_cleaned)] <- NA

# Check the updated dataset
str(dataset_crime_cleaned)
## tibble [89,335 × 15] (S3: tbl_df/tbl/data.frame)
##  $ report_number         : num [1:89335] 2.00e+08 2.02e+08 2.20e+08 2.22e+08 2.02e+08 ...
##  $ date_reported         : chr [1:89335] "09/04/2020 12:00:00 AM" "03/06/2020 12:00:00 AM" "01/14/2022 12:00:00 AM" "08/03/2022 12:00:00 AM" ...
##  $ date_occurrence       : chr [1:89335] "08/30/2020 12:00:00 AM" "03/06/2020 12:00:00 AM" "01/11/2022 12:00:00 AM" "08/03/2022 12:00:00 AM" ...
##  $ time_occurrence       : 'hms' num [1:89335] 11:00:00 22:30:00 08:00:00 00:01:00 ...
##   ..- attr(*, "units")= chr "secs"
##  $ area_name             : chr [1:89335] "Southwest" "Devonshire" "Southwest" "Mission" ...
##  $ crime_code_description: chr [1:89335] "VEHICLE - STOLEN" "BURGLARY" "THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER)" "THEFT OF IDENTITY" ...
##  $ category              : chr [1:89335] "THEFT" "BURGLARY" "THEFT" "THEFT" ...
##  $ victim_age            : num [1:89335] 0 47 0 43 46 26 35 36 26 30 ...
##  $ victim_sex            : chr [1:89335] NA "F" NA "F" ...
##  $ victim_descent        : chr [1:89335] NA "A" NA "H" ...
##  $ status_description    : chr [1:89335] "Invest Cont" "Invest Cont" "Invest Cont" "Invest Cont" ...
##  $ crime_code_1          : num [1:89335] 510 310 420 354 354 230 230 624 210 230 ...
##  $ latitude              : num [1:89335] 34 34.3 34 34.3 34 ...
##  $ longitude             : num [1:89335] -118 -119 -118 -118 -118 ...
##  $ agegroup              : Factor w/ 10 levels "0-5","10-15",..: 1 5 1 5 5 3 4 4 3 4 ...

Convert the date column into a proper date format

# Revise the date format
dataset_crime_cleaned <- dataset_crime_cleaned %>%
  mutate(
    date_reported = as.Date(date_reported, format = "%m/%d/%Y"),
    date_occurrence = as.Date(date_occurrence, format = "%m/%d/%Y")
  )

# Check the updated columns
str(dataset_crime_cleaned$date_reported)
##  Date[1:89335], format: "2020-09-04" "2020-03-06" "2022-01-14" "2022-08-03" "2020-01-31" ...
str(dataset_crime_cleaned$date_occurrence)
##  Date[1:89335], format: "2020-08-30" "2020-03-06" "2022-01-11" "2022-08-03" "2020-01-22" ...

Filter out any entries with invalid geocoordinates (LAT and LON)

# Check rows with missing or invalid geocoordinates
invalid_geo_coords <- dataset_crime_cleaned %>%
  filter(
    is.na(latitude) | is.na(longitude) | 
    latitude < -90 | latitude > 90 | 
    longitude < -180 | longitude > 180
  )

# Check if there are any invalid rows and print them
if (nrow(invalid_geo_coords) > 0) {
  print("Rows with invalid geocoordinates:")
  print(invalid_geo_coords)
    # Filter out rows with invalid geocoordinates values
    dataset_crime_cleaned <- dataset_crime_cleaned %>%
      filter(
        !is.na(latitude) & !is.na(longitude) & 
        latitude >= -90 & latitude <= 90 & 
        longitude >= -180 & longitude <= 180
  )
    # Check the updated columns
    str(dataset_crime_cleaned$latitude)
    str(dataset_crime_cleaned$longitude)
} else {
  print("No invalid geocoordinates found.")
}
## [1] "No invalid geocoordinates found."

🧐 ➡ There are no invalid geocoordinates in the dataset❗

Filter out entries with invalid age (age = 0)

# Filter out rows with invalid ages. The summary() above showed a minimum
# victim age of -2, so negative data-entry errors exist alongside 0.
# Bug fix: `victim_age != 0` kept those negative ages; keep only strictly
# positive ages instead.
dataset_crime_cleaned <- dataset_crime_cleaned %>%
  filter(victim_age > 0)

# Check the updated column (row count will be slightly below the original
# 67,224 once the negative-age rows are also dropped)
str(dataset_crime_cleaned$victim_age)

# Check that no zero or negative age entries remain
any(dataset_crime_cleaned$victim_age <= 0)  
## [1] FALSE
str(dataset_crime_cleaned)
## tibble [67,224 × 15] (S3: tbl_df/tbl/data.frame)
##  $ report_number         : num [1:67224] 2.02e+08 2.22e+08 2.02e+08 2.22e+08 2.02e+08 ...
##  $ date_reported         : Date[1:67224], format: "2020-03-06" "2022-08-03" ...
##  $ date_occurrence       : Date[1:67224], format: "2020-03-06" "2022-08-03" ...
##  $ time_occurrence       : 'hms' num [1:67224] 22:30:00 00:01:00 12:00:00 15:00:00 ...
##   ..- attr(*, "units")= chr "secs"
##  $ area_name             : chr [1:67224] "Devonshire" "Mission" "Southeast" "Foothill" ...
##  $ crime_code_description: chr [1:67224] "BURGLARY" "THEFT OF IDENTITY" "THEFT OF IDENTITY" "ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT" ...
##  $ category              : chr [1:67224] "BURGLARY" "THEFT" "THEFT" "ASSAULT" ...
##  $ victim_age            : num [1:67224] 47 43 46 26 35 36 26 30 42 55 ...
##  $ victim_sex            : chr [1:67224] "F" "F" "F" "F" ...
##  $ victim_descent        : chr [1:67224] "A" "H" "H" "H" ...
##  $ status_description    : chr [1:67224] "Invest Cont" "Invest Cont" "Invest Cont" "Invest Cont" ...
##  $ crime_code_1          : num [1:67224] 310 354 354 230 230 624 210 230 330 341 ...
##  $ latitude              : num [1:67224] 34.3 34.3 34 34.3 34.3 ...
##  $ longitude             : num [1:67224] -119 -118 -118 -118 -118 ...
##  $ agegroup              : Factor w/ 10 levels "0-5","10-15",..: 5 5 5 3 4 4 3 4 5 6 ...

🌦🧹 Cleaning the Weather Dataset️

Convert the date column into a proper date format

# Check the class of the datetime column and parse it only if it is not
# already a Date. inherits() is the idiomatic class test: class(x) can
# return a vector of length > 1 (e.g. "POSIXct" "POSIXt"), which makes a
# scalar `!=` comparison inside if() unreliable.
if (!inherits(dataset_weather$datetime, "Date")) {
  dataset_weather$datetime <- as.Date(dataset_weather$datetime, format="%Y-%m-%d")
      # Check the updated column
      str(dataset_weather$datetime)
} else {
  print("The datetime column is already in Date format.")
}
## [1] "The datetime column is already in Date format."

Data Visualization

🚨📈 Crime Rate Over Time

Plot the number of cases over time; Add title and axis labels to your plot(s)

#Line & Point plot
ggplot(dataset_crime_cleaned, aes(x = `date_occurrence`)) +
  geom_line(stat = "count", color = "navy") +  
  geom_point(stat = "count", color = "red", size = 0.15) +  
  scale_x_date(
    breaks = "6 months",  
    labels = scales::date_format("%Y-%m") 
  ) +
  labs(title = "Number of Cases Over Time",
       x = "Date",
       y = "Number of Cases") +
  theme_minimal() 

#Barplot
ggplot(dataset_crime_cleaned, aes(x = `date_occurrence`)) +
  geom_bar(fill = "navy") +  
  scale_x_date(
    breaks = "6 months",  
    labels = scales::date_format("%Y-%m") 
  ) +
  labs(title = "Number of Cases Over Time",
       x = "Date",
       y = "Number of Cases") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5))

Try different themes

#Dark Theme
ggplot(dataset_crime_cleaned, aes(x = `date_occurrence`)) +
  geom_line(stat = "count") +  
  scale_x_date(
    breaks = "6 months",  
    labels = scales::date_format("%Y-%m") 
  ) +
  labs(title = "Number of Cases Over Time",
       x = "Date",
       y = "Number of Cases") +
  theme_dark()

# Excel-style theme from ggthemes. Note this variant plots the *reported*
# date (date_reported) rather than the occurrence date.
ggplot(dataset_crime_cleaned, aes(x = date_reported)) +
  geom_line(stat = "count", color = "darkgreen") +
  scale_x_date(breaks = "6 months", labels = scales::date_format("%Y-%m")) +
  labs(
    title = "Number of Cases Over Time",
    x = "Date",
    y = "Number of Cases"
  ) +
  theme_excel()

Plot the rolling mean over 30 days over time

# Install zoo only when it is missing — an unconditional install.packages()
# re-downloads the package on every run/knit.
if (!requireNamespace("zoo", quietly = TRUE)) {
  install.packages("zoo")
}
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Count number of cases per day
daily_cases <- dataset_crime_cleaned %>%
  group_by(date_occurrence) %>%
  summarise(Number_of_Cases = n()) %>%
  arrange(date_occurrence)

# Compute the 30-day rolling mean (right-aligned: each value averages the
# current day plus the 29 preceding days, so the first 29 rows are NA)
daily_cases <- daily_cases %>%
  mutate(RollingMean_30d = rollmean(Number_of_Cases, k = 30, fill = NA, align = "right"))

# Plot the rolling mean over time (warm-up NA rows dropped via na.omit)
ggplot(na.omit(daily_cases), aes(x = date_occurrence)) +
  geom_line(aes(y = RollingMean_30d), color = "red", linewidth = 0.25) +
  labs(title = "Number of Cases Over Time (Rolling Mean)",
       x = "Date",
       y = "Number of Cases") +
  scale_x_date(
    # Take the break range from the table actually being plotted; it spans
    # the same dates as the raw data since daily_cases is derived from it.
    breaks = seq(min(daily_cases$date_occurrence),
                 max(daily_cases$date_occurrence),
                 by = "6 months"),
    labels = scales::date_format("%Y-%m")
  ) +
  theme_minimal()

Plot the rolling mean over 30 days grouped by the five most affected victim descents

# date_occurrence is already Date class after cleaning (see the str() output
# above). The original re-parse with as.Date(..., format = "%m/%d/%Y") was a
# no-op — as.Date() on a Date ignores `format` — so it is dropped.

# The five most affected victim descents. slice_max() supersedes top_n()
# and, like it, keeps ties at the cutoff.
top_victims <- dataset_crime_cleaned %>%
  group_by(victim_descent) %>%
  summarise(Number_of_Cases = n()) %>%
  slice_max(Number_of_Cases, n = 5) %>%
  pull(victim_descent)

# Filter the dataset to include only the top 5 victim descents
filtered_data <- dataset_crime_cleaned %>%
  filter(victim_descent %in% top_victims)

# Count number of cases per day, grouped by victim descent
daily_cases <- filtered_data %>%
  group_by(date_occurrence, victim_descent) %>%
  summarise(Number_of_Cases = n(), .groups = "drop") %>%
  arrange(date_occurrence)

# 30-day right-aligned rolling mean, computed within each descent;
# ungroup() afterwards so the grouping doesn't leak into later steps.
daily_cases <- daily_cases %>%
  group_by(victim_descent) %>%
  mutate(RollingMean_30d = rollmean(Number_of_Cases, k = 30, fill = NA, align = "right")) %>%
  ungroup()

# One smoothed line per descent (warm-up NA rows dropped)
ggplot(na.omit(daily_cases), aes(x = date_occurrence, y = RollingMean_30d, color = victim_descent, group = victim_descent)) +
  geom_line(linewidth = 0.25) +
  labs(title = "30-Day Rolling Mean of Cases Over Time by Top Five Victim Descent",
       x = "Date",
       y = "Number of Cases (Rolling Mean)",
       color = "Victim Descent") +
  scale_x_date(
    breaks = seq(min(daily_cases$date_occurrence), max(daily_cases$date_occurrence), by = "6 months"),
    labels = scales::date_format("%Y-%m")
  ) +
  theme_minimal()

🚔💥 Crime Types

# How many distinct crime categories does the cleaned data contain?
crime_types <- n_distinct(dataset_crime_cleaned$category)

# Show the result (no backticks needed around a plain variable name)
print(paste("Number of crime types:", crime_types))
## [1] "Number of crime types: 14"
# Cases per crime type (count() is group_by + summarise + n in one call)
crime_types_counts <- dataset_crime_cleaned %>%
  count(category, name = "Number_of_Cases")
crime_types_counts
## # A tibble: 14 × 2
##    category               Number_of_Cases
##    <chr>                            <int>
##  1 ASSAULT                          19085
##  2 BURGLARY                          9378
##  3 CHILDREN                           484
##  4 COURT/LEGAL VIOLATIONS            2029
##  5 FRAUD/FORGERY                      298
##  6 HOMICIDE                           174
##  7 OTHER                             2101
##  8 ROBBERY                           2816
##  9 SEX CRIMES                        1757
## 10 THEFT                            19839
## 11 THREATS                           3212
## 12 VANDALISM                         5969
## 13 VEHICLE CRIMES                       7
## 14 WEAPONS                             75

Plot the ten most common crime types in descending order

# Ten most frequent crime categories (ties at the cutoff are kept,
# matching top_n's behavior)
crime_counts <- dataset_crime_cleaned %>%
  count(category, sort = TRUE) %>%
  slice_max(n, n = 10)

# Horizontal bar chart, largest category at the top.
# geom_col() is geom_bar(stat = "identity").
ggplot(crime_counts, aes(x = reorder(category, n), y = n)) +
  geom_col(fill = "purple", color = "black", width = 0.5) +
  labs(
    title = "Top 10 Most Common Crime Types",
    x = "Crime Type",
    y = "Number of Cases"
  ) +
  coord_flip() +  # Flip the axes for better readability
  theme_minimal()

Plot the percentage of men and women affected by the 10 most common crime types

# Names of the ten most common crime categories
top10_categories <- dataset_crime_cleaned %>%
  count(category, sort = TRUE) %>%
  slice_head(n = 10) %>%
  pull(category)

# Cases by crime type and gender within those categories, NA sex removed
crime_gender_counts <- dataset_crime_cleaned %>%
  filter(category %in% top10_categories, !is.na(victim_sex)) %>%
  count(category, victim_sex, name = "Number_of_Cases")

# Share of each gender within each crime type (sums to 100 per category)
crime_gender_percentage <- crime_gender_counts %>%
  group_by(category) %>%
  mutate(Percentage = 100 * Number_of_Cases / sum(Number_of_Cases)) %>%
  ungroup()

# Stacked percentage bars for the top 10 crime types
ggplot(crime_gender_percentage, aes(x = reorder(category, Percentage), y = Percentage, fill = victim_sex)) +
  geom_col(position = "stack", color = "black", width = 0.5) +
  scale_fill_manual(values = c("pink", "lightblue", "lightgreen", "yellow")) +
  labs(
    title = "Distribution of Gender for Top 10 Crime Types",
    x = "Crime Type",
    y = "Percentage",
    fill = "Gender"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Group by gender and crime category & count the number of occurrences
crime_gender_counts <- dataset_crime_cleaned %>%
  filter(!is.na(victim_sex)) %>%
  group_by(victim_sex, category) %>%
  summarise(count = n(), .groups = "drop")

# Dodged bars: one bar per gender, side by side within each crime type
ggplot(crime_gender_counts, aes(x = reorder(category, count), y = count, fill = victim_sex)) +
  geom_bar(stat = "identity", position = "dodge", color = "black") +
  labs(
    title = "Distribution of Crime Types by Gender",
    x = "Crime Type",
    y = "Number of Cases",  # fixed axis-label typo ("Numnber")
    fill = "Victim Gender"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Grouping and Merging Data

🤔📝 Hypothesis: “Hot temperatures directly influence aggression and violence.”

Find out the crime count per day

# NOTE(review): the comment and the code disagree here. This line copies
# date_occurrence into date_reported, silently overwriting the original
# date_reported column. If the intent was to convert date_reported itself,
# the right-hand side should reference date_reported — confirm before any
# downstream use of date_reported. (Also, as.Date() on an already-Date
# column is a no-op; the format argument has no effect either way.)
dataset_crime_cleaned$date_reported <- as.Date(dataset_crime_cleaned$date_occurrence, format = "%m/%d/%Y")

# Count the number of crimes per day (one row per occurrence date)
crime_count_per_day <- dataset_crime_cleaned %>%
  group_by(date_occurrence) %>%
  summarise(crime_count = n(), .groups = "drop")

# Show the result
crime_count_per_day
## # A tibble: 1,461 × 2
##    date_occurrence crime_count
##    <date>                <int>
##  1 2020-01-01              105
##  2 2020-01-02               48
##  3 2020-01-03               59
##  4 2020-01-04               44
##  5 2020-01-05               32
##  6 2020-01-06               41
##  7 2020-01-07               35
##  8 2020-01-08               37
##  9 2020-01-09               37
## 10 2020-01-10               51
## # ℹ 1,451 more rows

Merge daily temperatures and daily crime count

# datetime was parsed as a Date when the CSV was read, so this conversion
# is a safeguard (a no-op on a Date column).
dataset_weather$datetime <- as.Date(dataset_weather$datetime, format = "%m/%d/%Y")

# Attach the day's weather to each daily crime count; days with no weather
# record would get NA weather columns.
merged_data <- crime_count_per_day %>%
  left_join(dataset_weather, by = c("date_occurrence" = "datetime"))

merged_data
## # A tibble: 1,461 × 6
##    date_occurrence crime_count  temp feelslike humidity windspeed
##    <date>                <int> <dbl>     <dbl>    <dbl>     <dbl>
##  1 2020-01-01              105  15.2      15.2     48.9      12  
##  2 2020-01-02               48  14.8      14.8     69.6      11.1
##  3 2020-01-03               59  16.2      16.2     53         8.1
##  4 2020-01-04               44  14.7      14.7     63.6       6.6
##  5 2020-01-05               32  14.5      14.5     59.1       8.6
##  6 2020-01-06               41  17.1      17.1     34.9       7.4
##  7 2020-01-07               35  17.6      17.6     32.3       7.8
##  8 2020-01-08               37  14.3      14.3     68.1       9.8
##  9 2020-01-09               37  14        14       58.3      12.7
## 10 2020-01-10               51  13.3      13.3     61.2      10  
## # ℹ 1,451 more rows

Linear Regression

🎨📈 Visualization

Create a graph to visualize the Crime Count against the Temperature over the years

# Ensure date_occurrence is Date class (no-op if it already is)
merged_data$date_occurrence <- as.Date(merged_data$date_occurrence)

# Calculate 30-day rolling mean for crime count per day and temperature
# (right-aligned: each value averages the current day plus the 29 before
# it, so the first 29 rows are NA)
merged_data <- merged_data %>%
  arrange(date_occurrence) %>%
  mutate(
    rolling_crime_count = rollapply(crime_count, width = 30, FUN = mean, fill = NA, align = "right"),
    rolling_temperature = rollapply(temp, width = 30, FUN = mean, fill = NA, align = "right")
  )

# Hand-tuned scale factor mapping temperature onto the crime-count axis;
# the secondary axis below applies the inverse (~ . * coeff) so its labels
# read in degrees C again.
coeff <- 0.47

# Crime count on the left axis and (rescaled) temperature on the right-hand
# secondary axis, both as 30-day rolling means; NA warm-up rows dropped.
ggplot(na.omit(merged_data), aes(x = date_occurrence)) +
  geom_line(aes(y = rolling_crime_count, color = "Rolling Crime Count"), linewidth = 0.25) + 
  geom_line(aes(y = rolling_temperature / coeff, color = "Rolling Temperature"), linewidth = 0.25) + 
  labs(title = "Crime Count and Temperature Over Time (Rolling Mean)",
       x = "Date", 
       y = "Rolling Mean Value", 
       color = "Legend") +
  scale_color_manual(values = c("Rolling Crime Count" = "red", "Rolling Temperature" = "orange")) +
  scale_y_continuous(
    name = "Crime Count (Rolling Mean)",
            breaks = seq(0, max(merged_data$rolling_crime_count, na.rm = TRUE), by = 10),
    sec.axis = sec_axis(~ .*coeff, name = "Temperature (°C Rolling Mean)")
  ) +
  scale_x_date(
    breaks = seq(min(merged_data$date_occurrence), max(merged_data$date_occurrence), by = "6 months"),  
    labels = scales::date_format("%Y-%m") 
  ) +
  theme_minimal() +
  theme(axis.text.y.left = element_text(margin = margin(l = 10)),
        axis.text.y.right = element_text(margin = margin(r = 10)),
        axis.title.y.right = element_text(angle = 90),
        legend.title = element_blank())

Create a simple linear regression for temperatures and crime count

# Simple linear regression: does daily mean temperature predict the daily
# crime count? (One observation per day in merged_data.)
lm_model <- lm(crime_count ~ temp, data = merged_data)
summary(lm_model)
## 
## Call:
## lm(formula = crime_count ~ temp, data = merged_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.155  -6.507  -1.221   4.910  59.647 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  42.0108     1.2590  33.369  < 2e-16 ***
## temp          0.2199     0.0676   3.253  0.00117 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.21 on 1459 degrees of freedom
## Multiple R-squared:  0.007199,   Adjusted R-squared:  0.006518 
## F-statistic: 10.58 on 1 and 1459 DF,  p-value: 0.00117
# Install olsrr only when it is missing — an unconditional
# install.packages() re-downloads the package on every run/knit.
if (!requireNamespace("olsrr", quietly = TRUE)) {
  install.packages("olsrr")
}
library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
# Reuse lm_model fitted above (refitting the identical formula here was
# redundant) and print olsrr's richer regression report.
ols_regress(lm_model)
##                           Model Summary                           
## -----------------------------------------------------------------
## R                       0.085       RMSE                  10.205 
## R-Squared               0.007       MSE                  104.136 
## Adj. R-Squared          0.007       Coef. Var             22.193 
## Pred R-Squared          0.004       AIC                10939.502 
## MAE                     7.509       SBC                10955.363 
## -----------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                  ANOVA                                   
## ------------------------------------------------------------------------
##                   Sum of                                                
##                  Squares          DF    Mean Square      F         Sig. 
## ------------------------------------------------------------------------
## Regression      1103.145           1       1103.145    10.579    0.0012 
## Residual      152142.633        1459        104.279                     
## Total         153245.778        1460                                    
## ------------------------------------------------------------------------
## 
##                                   Parameter Estimates                                    
## ----------------------------------------------------------------------------------------
##       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
## ----------------------------------------------------------------------------------------
## (Intercept)    42.011         1.259                 33.369    0.000    39.541    44.480 
##        temp     0.220         0.068        0.085     3.253    0.001     0.087     0.352 
## ----------------------------------------------------------------------------------------

Interpret the variable (magnitude, significant, etc.)

1. Regression Equation

The estimated regression equation is:

y = 42.0108 + 0.2199 x temp

  • y = Predicted crime count

  • Intercept (42.0108): When temperature = 0°C, the expected crime count is approximately 42 crimes per day

  • Slope (0.2199): For each 1°C increase in temperature, the crime count increases by 0.22 crimes per day on average

2. Statistical Significance

  • The p-value for temperature = 0.00117 tells us whether the relationship is statistically significant. A p-value this small (0.117%) means that there is very strong evidence against the null hypothesis, which suggests that temperature does have a real relationship with crime count. In other words, the likelihood of observing this result by chance is only 0.117%, making this a statistically significant relationship
  • The t-value = 3.253, which measures how many standard errors the estimated coefficient is away from zero, is relatively high. A higher t-value indicates a stronger effect, and since it’s well above the typical critical values (usually around 2 or greater), this further supports that temperature has a statistically significant impact on crime count
  • The intercept’s p-value is < 2e-16, meaning the baseline crime count (when temperature is zero) is estimated very precisely, with a vanishingly small chance of this result being random.

3. Goodness of Fit (R-squared)

  • The Multiple R-squared = 0.007199 and Adjusted R-squared = 0.006518 suggest that only 0.72% of the variation in crime count is explained by temperature

4. Residual Analysis

  • The Residual Standard Error (10.21) means that the predicted crime count can deviate from the actual value by roughly ±10 crimes per day
  • The residuals range from -26.16 to 59.65, showing that crime count has a wide spread that temperature alone does not fully capture

5. Conclusion

Temperature significantly affects crime count, but the effect size is small (0.22 crimes per °C). The model is weak at predicting crime count (low R² ≈ 0.72%), meaning other factors must be included for better accuracy, such as day of the week, seasonality, holidays, or socioeconomic factors.

Is it causal?

A relationship is causal if changing one variable directly causes a change in another.

The analysis suggests that temperature and crime count are associated, but correlation does not imply causation:

  • The relationship is in fact statistically significant, but not high (p-value = 0.00117)
  • Only 0.72% of crime count variability is explained by temperature (R² ≈ 0.72%). If temperature were a strong causal factor, R² would be much higher

Include more variables in the regression

Adding variables: “feelslike”, “humidity”, “windspeed” to the regression

# Multiple regression: add perceived temperature, humidity and wind speed
# as predictors. NOTE(review): feelslike is derived from temp (plus
# humidity/windspeed), so including both invites multicollinearity — the
# coefficient signs and significance below should be read with that in mind.
lm_model_multi <- lm(crime_count ~ temp + feelslike + humidity + windspeed, data = merged_data)
summary(lm_model_multi)
## 
## Call:
## lm(formula = crime_count ~ temp + feelslike + humidity + windspeed, 
##     data = merged_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.938  -6.547  -1.164   4.923  59.615 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 41.745116   1.831131  22.797   <2e-16 ***
## temp        -2.301172   1.839047  -1.251    0.211    
## feelslike    2.514220   1.835962   1.369    0.171    
## humidity    -0.001949   0.019277  -0.101    0.919    
## windspeed    0.041389   0.054268   0.763    0.446    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.21 on 1456 degrees of freedom
## Multiple R-squared:  0.008958,   Adjusted R-squared:  0.006236 
## F-statistic:  3.29 on 4 and 1456 DF,  p-value: 0.01075
# Reuse lm_model_multi fitted above (refitting the identical formula here
# was redundant) and print olsrr's detailed regression report.
ols_regress(lm_model_multi)
##                           Model Summary                           
## -----------------------------------------------------------------
## R                       0.095       RMSE                  10.196 
## R-Squared               0.009       MSE                  103.951 
## Adj. R-Squared          0.006       Coef. Var             22.197 
## Pred R-Squared          0.002       AIC                10942.910 
## MAE                     7.504       SBC                10974.631 
## -----------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                  ANOVA                                  
## -----------------------------------------------------------------------
##                   Sum of                                               
##                  Squares          DF    Mean Square      F        Sig. 
## -----------------------------------------------------------------------
## Regression      1372.837           4        343.209     3.29    0.0107 
## Residual      151872.942        1456        104.308                    
## Total         153245.778        1460                                   
## -----------------------------------------------------------------------
## 
##                                   Parameter Estimates                                    
## ----------------------------------------------------------------------------------------
##       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
## ----------------------------------------------------------------------------------------
## (Intercept)    41.745         1.831                 22.797    0.000    38.153    45.337 
##        temp    -2.301         1.839       -0.888    -1.251    0.211    -5.909     1.306 
##   feelslike     2.514         1.836        0.972     1.369    0.171    -1.087     6.116 
##    humidity    -0.002         0.019       -0.003    -0.101    0.919    -0.040     0.036 
##   windspeed     0.041         0.054        0.020     0.763    0.446    -0.065     0.148 
## ----------------------------------------------------------------------------------------

Interpret the variables

1. Regression Equation

The estimated regression equation is:

y = 41.75 − 2.30 × temp + 2.51 × feelslike − 0.0019 × humidity + 0.0414 × windspeed

  • Intercept (41.75): When all weather variables are at zero, the predicted crime count is approximately 41.75 crimes per day
  • Temperature (-2.30): Holding other factors constant, a 1°C increase in temperature decreases crime count by 2.30 crimes per day, though this result is not statistically significant (p = 0.211)
  • Feels-like Temperature (2.51): A 1°C increase in the perceived temperature increases crime count by 2.51 crimes per day, but this effect is not statistically significant (p = 0.171)
  • Humidity (-0.0019): Humidity has a negligible impact on crime count, with an estimated decrease of 0.0019 crimes per percentage increase in humidity, and is not significant (p = 0.919)
  • Wind Speed (0.0414): A 1 km/h increase in wind speed is associated with an increase of 0.0414 crimes per day, but this effect is not significant (p = 0.446)

2. Statistical Significance

  • None of the predictors are statistically significant, as all p-values are greater than 0.05. This means we cannot confidently conclude that temperature, feels-like temperature, humidity, or wind speed have a real impact on crime count in this dataset
  • The F-statistic = 3.29 and p = 0.01075 suggest that, as a group, these weather variables do have some influence on crime count, but the individual effects are weak

3. Goodness of Fit (R-squared)

  • The Multiple R-squared = 0.00896 suggest that only 0.90% of the variance in crime count is explained by these weather factors
  • The Adjusted R-squared = 0.00624 means after adjusting for the number of predictors, the explanatory power remains very low
  • This means the model has very poor predictive ability, suggesting that other unmeasured factors are driving crime rates

4. Residual Analysis

  • The Residual Standard Error = 10.21 shows that the model’s predictions deviate from actual crime counts by about ±10 crimes per day on average
  • The Residuals range from -25.94 to 59.61 indicates that actual crime counts can vary widely around the predicted values

5. Conclusion

  • None of the weather variables have a statistically significant impact on crime count. In the first regression (where crime count was only predicted by temperature), temperature was statistically significant with p = 0.00117. However, in the multiple regression (including feels-like temperature, humidity, and wind speed), temperature is no longer significant (p = 0.211)
  • The model explains less than 1% of the variance in crime count, meaning it lacks predictive power

Should we include the variable temp and feelslike at the same time? Why or why not?

The feelslike (perceived temperature) is not a completely independent variable—it is calculated based on temp (actual temperature), along with humidity and wind speed.

  • Different weather agencies use slightly different formulas, but generally, the heat index (for hot weather) and the wind chill index (for cold weather) are used
  • In hot weather: High humidity makes it feel hotter because sweat evaporates more slowly, therefore, feelslike is higher than temp when humidity is high
  • In cold weather: High windspeed makes it feel colder because it increases heat loss from the skin, therefore, feelslike is lower than temp when windspeed is high

Considering that, we should not include both temp and feelslike in the same regression model. *Since the two variables are highly correlated (feelslike is mathematically derived from temp, humidity, and windspeed), including both temp and feelslike in a regression model can cause multicollinearity (high correlation between independent variables).

  • Multicollinearity distorts coefficient estimates
  • When two variables are strongly correlated, the model struggles to determine their individual effects, making their coefficients unstable and less reliable
  • In this analysis, feelslike is derived from temp using humidity and windspeed, meaning they provide redundant information.

We saw this in the regression results: when both variables temp and feelslike were included, neither was statistically significant (p-values > 0.05). This happens because their shared variance causes the model to distribute the explanatory power between them, weakening their individual effects.

High correlation also leads to inflated standard errors for both variables, reducing our confidence in their estimated effects.

Is it causal now?

The analysis shows that weather variables and crime count are correlated, but not causal

  • The relationship is not statistically significant for temperature (p = 0.211) or other weather factors
  • The model explains only about 0.6% of crime variability (adjusted R² ≈ 0.0062; multiple R² ≈ 0.009), meaning weather alone is a weak predictor

Further graph and analysis

🗺📍 Distribution of Homicide in the Los Angeles area using (1) ggplot and (2) ggmap & stadiamaps

Using map plot to geographically represent the distribution of homicide in the Los Angeles area (Location: California county, United States of America)

library(ggplot2)
library(dplyr)
library(usmap)
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(mapdata)
# County polygons for California; rename coordinates to match the crime
# data's column names
usa_county <- map_data("county") 
usa_county_map <- usa_county %>%
  rename(longitude = long,
         latitude = lat)
california <- subset(usa_county_map, region=="california")

#Filter the dataset to select only homicide crimes
dataset_homicide <- dataset_crime_cleaned[dataset_crime_cleaned$category == "HOMICIDE", ]

# NOTE(review): the original left_join of crime points onto polygon
# vertices by exact longitude/latitude was removed — coordinates virtually
# never match exactly, so the join contributed nothing to the plot (each
# layer below supplies its own data) and risked a many-to-many row blow-up.

# Plot the map of distribution of homicide in Los Angeles area
ggplot() +
    geom_polygon(data = california, aes(x = longitude, y = latitude, group = group),
                   color = "black", linewidth = 0.2, fill = "white") +
    geom_point(data = dataset_homicide, aes(x = longitude, y = latitude), 
               fill = "red", color = "black", shape = 21, size = 2) +
  
    # Limit the visible area to Los Angeles without dropping data
    coord_cartesian(xlim = c(-119.5, -117.0), ylim = c(33.7, 34.4)) +
  
    labs(title = "Distribution of Homicide in Los Angeles Area",
         x = "",
         y = "",
         color = "") +
    
    theme_minimal()

library(ggplot2)
library(dplyr)
library(ggmap)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
##   Stadia Maps' Terms of Service: <https://stadiamaps.com/terms-of-service/>
##   OpenStreetMap's Tile Usage Policy: <https://operations.osmfoundation.org/policies/tiles/>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
# SECURITY NOTE(review): this Stadia Maps API key is hard-coded and
# committed to source. Rotate the key and load it from an environment
# variable instead, e.g. register_stadiamaps(Sys.getenv("STADIA_MAPS_KEY")).
register_stadiamaps(key = "c9527f0a-1729-42db-ac61-8dfe590e5e5f")

# Define bounding box for Los Angeles (degrees: left/bottom/right/top)
la_bbox <- c(left = -118.75, bottom = 33.7, right = -118.0, top = 34.4)

# Get OpenStreetMap tile for the Los Angeles area from Stadia Maps
la_map <- get_stadiamap(bbox = la_bbox, zoom = 11, maptype = "stamen_toner_lite")
## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.
# Filter dataset to homicide crimes that fall inside the bounding box
dataset_homicide <- dataset_crime_cleaned %>%
  filter(category == "HOMICIDE") %>%
  filter(longitude >= la_bbox["left"] & longitude <= la_bbox["right"], 
         latitude >= la_bbox["bottom"] & latitude <= la_bbox["top"])

# Plot the detailed street map with homicide locations
ggmap(la_map) +
  geom_point(data = dataset_homicide, aes(x = longitude, y = latitude), 
             fill = "red", color = "black", shape = 21, size = 2) +
  labs(title = "Distribution of Homicide in Los Angeles Area",
       x = "",
       y = "") +
  theme_minimal() 

🌍📈 Distribution of Top 8 most committed crimes in Los Angeles using interactive map in plotly

# Install plotly only when it is missing — avoids re-downloading on every
# run/knit.
if (!requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
}
library(plotly)

# Count the occurrences for each crime category and keep the top 8
top_8_crimes <- dataset_crime_cleaned %>%
  count(category) %>%
  arrange(desc(n)) %>%
  head(8)

# Extract the category names from top_8_crimes
top_8_categories <- top_8_crimes$category

# Filter the original dataset for only these top 8 categories
df <- dataset_crime_cleaned %>%
  filter(category %in% top_8_categories)

# (The unused `m` colorbar list from the original was removed; it was
# never passed to the plot.)

# Set up map layout.
# FIX: plotly's geo `scope` must be one of "world", "usa", "europe",
# "asia", "africa", "north america", "south america" — the original value
# 'america' is not valid.
g <- list(
  scope = 'north america',
  showland = TRUE,
  landcolor = toRGB("grey83"),
  subunitcolor = toRGB("white"),
  countrycolor = toRGB("white"),
  showlakes = TRUE,
  lakecolor = toRGB("white"),
  showsubunits = TRUE,
  showcountries = TRUE,
  resolution = 50,
  projection = list(
    type = 'conic conformal',
    rotation = list(lon = -118.25)
  ),
  lonaxis = list(
    showgrid = TRUE,
    gridwidth = 0.5,
    range = c(-119, -117.8),
    dtick = 0.2
    ),  
  lataxis = list(
    showgrid = TRUE,
    gridwidth = 0.5,
    range = c(33.5, 34.5),
    dtick = 0.2
    )
)

# Create interactive map, one marker color per crime category
fig <- plot_geo(df, lat = ~latitude, lon = ~longitude, color = ~category) 

# Add crime markers; reference columns through the formula rather than df$
# so the trace stays tied to the plot's own data
fig <- fig %>% add_markers(
  text = ~paste("Crime:", category, "<br>Description:", crime_code_description, "<br>Location", area_name),  
  hoverinfo = "text")

# Final layout
fig <- fig %>% layout(title = "Crime Distribution in Los Angeles (Hover for Details)", geo = g)

fig

Thank you for reading! 🕵️️🔍📝🚔🚨📊📈